/*
* Terrier - Terabyte Retriever
* Webpage: http://terrier.org
* Contact: terrier{a.}dcs.gla.ac.uk
* University of Glasgow - School of Computing Science
* http://www.gla.ac.uk/
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is BasicIndexer.java.
*
* The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
* All Rights Reserved.
*
* Contributor(s):
* Craig Macdonald <craigm{a.}dcs.gla.ac.uk> (original author)
* Vassilis Plachouras <vassilis{a.}dcs.gla.ac.uk>
*/
package org.terrier.indexing;
import gnu.trove.TIntHashSet;
import java.io.IOException;
import java.util.Map;
import java.util.Set;
import org.terrier.compression.BitIn;
import org.terrier.structures.BasicDocumentIndexEntry;
import org.terrier.structures.BasicLexiconEntry;
import org.terrier.structures.BitIndexPointer;
import org.terrier.structures.DirectInvertedOutputStream;
import org.terrier.structures.DocumentIndexEntry;
import org.terrier.structures.FieldDirectInvertedOutputStream;
import org.terrier.structures.FieldDocumentIndexEntry;
import org.terrier.structures.FieldLexiconEntry;
import org.terrier.structures.Index;
import org.terrier.structures.indexing.DocumentIndexBuilder;
import org.terrier.structures.indexing.DocumentPostingList;
import org.terrier.structures.indexing.FieldDocumentPostingList;
import org.terrier.structures.indexing.FieldLexiconMap;
import org.terrier.structures.indexing.InvertedIndexBuilder;
import org.terrier.structures.indexing.LexiconBuilder;
import org.terrier.structures.indexing.LexiconMap;
import org.terrier.terms.TermPipeline;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.ArrayUtils;
import org.terrier.utility.FieldScore;
import org.terrier.utility.TermCodes;
/**
* BasicIndexer is the default indexer for Terrier. It takes
* terms from each Document object provided by the collection, and
* adds terms to temporary Lexicons, and into the DirectFile.
* The documentIndex is updated to give the pointers into the Direct
* file. The temporary lexicons are then merged into the main lexicon.
* Inverted Index construction takes place as a second step.
* <br>
* <b>Properties:</b>
* <ul>
* <li><tt>indexing.max.encoded.documentindex.docs</tt> - how many docs before the DocumentIndexEncoded is dropped in favour of the DocumentIndex (on disk implementation).</li>
* <li><tt>string.use_utf</tt> - use the UTF index structures?</li>
* <li><i>See Also: Properties in </i><a href="Indexer.html">org.terrier.indexing.Indexer</a> <i>and</i> <a href="BlockIndexer.html">org.terrier.indexing.BlockIndexer</a></li>
* </ul>
* @author Craig Macdonald &amp; Vassilis Plachouras
* @see org.terrier.indexing.Indexer
* @see org.terrier.indexing.BlockIndexer
*/
public class BasicIndexer extends Indexer
{
	/**
	 * This class implements an end of a TermPipeline that adds the
	 * term to the DocumentTree. This TermProcessor does NOT have field
	 * support.
	 */
	protected class BasicTermProcessor implements TermPipeline
	{
		/**
		 * Term pipeline implementation: records the term in the current
		 * document's posting list and counts it towards the document length.
		 * @param term the term arriving from the pipeline; null means the
		 *        term has been filtered out upstream (eg stopword removal)
		 */
		public void processTerm(String term)
		{
			/* null means the term has been filtered out (eg stopwords) */
			if (term != null)
			{
				//add term to the in-memory posting list for the current document
				termsInDocument.insert(term);
				numOfTokensInDocument++;
			}
		}
		/** Resets this pipeline end between documents. Stateless, so nothing to do.
		 * @return true always */
		public boolean reset() {
			return true;
		}
	}
	/** This class implements an end of a TermPipeline that adds the
	 * term to the DocumentTree. This TermProcessor does have field
	 * support.
	 */
	protected class FieldTermProcessor implements TermPipeline
	{
		/** Scratch set of 0-based field ids the current term occurs in; cleared after each term. */
		final TIntHashSet fields = new TIntHashSet(numFields);
		/** True if a catch-all "ELSE" pseudo-field is configured in fieldNames. */
		final boolean ELSE_ENABLED = fieldNames.containsKey("ELSE");
		/* NOTE(review): when "ELSE" is absent, fieldNames.get("ELSE") presumably
		 * returns 0 (Trove missing-key convention), making this constant -1.
		 * That appears harmless since it is only used when ELSE_ENABLED is true,
		 * but confirm fieldNames is a Trove object-to-int map. */
		final int ELSE_FIELD_ID = fieldNames.get("ELSE") -1;
		/**
		 * Term pipeline implementation: records the term, tagged with the ids
		 * of the fields it occurred in, and counts it towards the document length.
		 * Fields not registered in fieldNames are ignored; if no registered
		 * field matched and "ELSE" is enabled, the term is assigned to the
		 * ELSE pseudo-field.
		 * @param term the term arriving from the pipeline; null means the
		 *        term has been filtered out upstream (eg stopword removal)
		 */
		public void processTerm(String term)
		{
			/* null means the term has been filtered out (eg stopwords) */
			if (term != null)
			{
				/* map each field name the term occurred in to its 0-based id */
				for (String fieldName: termFields)
				{
					int tmp = fieldNames.get(fieldName);
					//0 means the field name is not indexed; stored ids are 1-based
					if (tmp > 0)
					{
						fields.add(tmp -1);
					}
				}
				//no recognised field: fall back to the ELSE pseudo-field, if configured
				if (ELSE_ENABLED && fields.size() == 0)
				{
					fields.add(ELSE_FIELD_ID);
				}
				//cast is safe: field indexing implies createDocumentPostings() built a FieldDocumentPostingList
				((FieldDocumentPostingList)termsInDocument).insert(term,fields.toArray());
				numOfTokensInDocument++;
				fields.clear();
			}
		}
		/** Resets this pipeline end between documents. The per-term fields set
		 * is already cleared after each term, so nothing to do.
		 * @return true always */
		public boolean reset() {
			return true;
		}
	}
	/**
	 * A private variable for storing the fields a term appears into.
	 */
	protected Set<String> termFields;
	/**
	 * The structure that holds the terms found in a document.
	 */
	protected DocumentPostingList termsInDocument;
	/**
	 * The number of tokens found in the current document so far.
	 */
	protected int numOfTokensInDocument = 0;
	/** Protected do-nothing constructor for use by child classes. Classes which
	 * use this method must call init() */
	protected BasicIndexer(long a, long b, long c) {
		super(a,b,c);
	}
	/**
	 * Constructs an instance of a BasicIndexer, using the given path name
	 * for storing the data structures.
	 * @param path String the path where the data structures will be created. This is assumed to be
	 * absolute.
	 * @param prefix String the filename component of the data structures
	 */
	public BasicIndexer(String path, String prefix) {
		super(path, prefix);
		//delay the execution of init() if we are a parent class:
		//subclasses call init() themselves once their own fields are set up
		if (this.getClass() == BasicIndexer.class)
			init();
	}
	/**
	 * Returns the end of the term pipeline, which corresponds to
	 * an instance of either BasicIndexer.BasicTermProcessor, or
	 * BasicIndexer.FieldTermProcessor, depending on whether
	 * field information is stored.
	 * @return TermPipeline the end of the term pipeline.
	 */
	protected TermPipeline getEndOfPipeline()
	{
		if(FieldScore.USE_FIELD_INFORMATION)
			return new FieldTermProcessor();
		return new BasicTermProcessor();
	}
	/**
	 * Creates the direct index, the document index and the lexicon.
	 * Loops through each document in each of the collections,
	 * extracting terms and pushing these through the Term Pipeline
	 * (eg stemming, stopping, lowercase). Finishes by registering the
	 * direct-index structures on the index, closing the builders, and
	 * merging the temporary lexicons.
	 * @param collections Collection[] the collections to be indexed.
	 */
	public void createDirectIndex(Collection[] collections)
	{
		currentIndex = Index.createNewIndex(path, prefix);
		//field-aware or basic lexicon, depending on configuration
		lexiconBuilder = FieldScore.FIELDS_COUNT > 0
			? new LexiconBuilder(currentIndex, "lexicon", new FieldLexiconMap(FieldScore.FIELDS_COUNT), FieldLexiconEntry.class.getName())
			: new LexiconBuilder(currentIndex, "lexicon", new LexiconMap(), BasicLexiconEntry.class.getName());
		try{
			directIndexBuilder = FieldScore.FIELDS_COUNT > 0
				? new FieldDirectInvertedOutputStream(currentIndex.getPath() + ApplicationSetup.FILE_SEPARATOR + currentIndex.getPrefix() + "." + "direct" + BitIn.USUAL_EXTENSION)
				: new DirectInvertedOutputStream(currentIndex.getPath() + ApplicationSetup.FILE_SEPARATOR + currentIndex.getPrefix() + "." + "direct" + BitIn.USUAL_EXTENSION);
		} catch (IOException ioe) {
			// logger.error("Cannot make DirectInvertedOutputStream:", ioe);
			/* NOTE(review): this IOException is swallowed, leaving
			 * directIndexBuilder null; later uses (writePostings, close)
			 * will then throw NullPointerException. Consider rethrowing. */
		}
		// new DirectIndexBuilder(currentIndex, "direct");
		docIndexBuilder = new DocumentIndexBuilder(currentIndex, "document");
		metaBuilder = createMetaIndexBuilder();
		emptyDocIndexEntry = (FieldScore.FIELDS_COUNT > 0) ? new FieldDocumentIndexEntry(FieldScore.FIELDS_COUNT) : new BasicDocumentIndexEntry();
		//int LexiconCount = 0;
		int numberOfDocuments = 0; int numberOfTokens = 0;
		//final long startBunchOfDocuments = System.currentTimeMillis();
		final int collections_length = collections.length;
		final boolean boundaryDocsEnabled = BUILDER_BOUNDARY_DOCUMENTS.size() > 0;
		boolean stopIndexing = false;
		for(int collectionNo = 0; ! stopIndexing && collectionNo < collections_length; collectionNo++)
		{
			final Collection collection = collections[collectionNo];
			long startCollection = System.currentTimeMillis();
			boolean notLastDoc = false;
			//while(notLastDoc = collection.hasNext()) {
			while ((notLastDoc = collection.nextDocument())) {
				//get the next document from the collection
				//String docid = collection.getDocid();
				//Document doc = collection.next();
				Document doc = collection.getDocument();
				if (doc == null)
					continue;
				numberOfDocuments++;
				/* setup for parsing */
				createDocumentPostings();
				String term; //term we're currently processing
				numOfTokensInDocument = 0;
				//get each term in the document
				while (!doc.endOfDocument()) {
					if ((term = doc.getNextTerm())!=null && !term.equals("")) {
						termFields = doc.getFields();
						/* pass term into TermPipeline (stop, stem etc) */
						pipeline_first.processTerm(term);
						/* the term pipeline will eventually add the term to this object. */
					}
					//enforce the per-document token cap, if one is configured
					if (MAX_TOKENS_IN_DOCUMENT > 0 &&
							numOfTokensInDocument > MAX_TOKENS_IN_DOCUMENT)
						break;
				}
				//if we didn't index all tokens from document,
				//we need to get to the end of the document.
				while (!doc.endOfDocument())
					doc.getNextTerm();
				pipeline_first.reset();
				/* we now have all terms in the DocumentTree, so we save the document tree */
				try
				{
					if (termsInDocument.getDocumentLength() == 0)
					{	/* this document is empty, add the minimum to the document index */
						indexEmpty(doc.getAllProperties());
					}
					else
					{	/* index this document */
						numberOfTokens += numOfTokensInDocument;
						indexDocument(doc.getAllProperties(), termsInDocument);
					}
				}
				catch (Exception ioe)
				{
					// logger.error("Failed to index "+doc.getProperty("docno"),ioe);
					/* NOTE(review): failures to index a single document are
					 * swallowed so indexing continues with the next document;
					 * with logging commented out, failures are invisible. */
				}
				//stop early once the per-builder document budget is exhausted
				if (MAX_DOCS_PER_BUILDER>0 && numberOfDocuments >= MAX_DOCS_PER_BUILDER)
				{
					stopIndexing = true;
					break;
				}
				//stop early at a configured boundary document
				if (boundaryDocsEnabled && BUILDER_BOUNDARY_DOCUMENTS.contains(doc.getProperty("docno")))
				{
					// //logger.warn("Document "+doc.getProperty("docno")+" is a builder boundary document. Boundary forced.");
					stopIndexing = true;
					break;
				}
			}
			//only close the collection if it was exhausted (not stopped early)
			if (! notLastDoc)
			{
				try{
					collection.close();
				} catch (IOException e) {
					// //logger.warn("Couldnt close collection", e);
				}
			}
			long endCollection = System.currentTimeMillis();
			//secs only feeds the (commented-out) timing log below
			long secs = ((endCollection-startCollection)/1000);
			// //logger.info("Collection #"+collectionNo+ " took "+secs+" seconds to index "
			//	+"("+numberOfDocuments+" documents)");
			// if (secs > 3600)
			//	//logger.info("Rate: "+((double)numberOfDocuments/((double)secs/3600.0d))+" docs/hour");
		}
		finishedDirectIndexBuild();
		/*end of all the collections has been reached */
		/* flush the index buffers */
		/* register the direct index structures (random access and stream) on the index */
		currentIndex.addIndexStructure(
				"direct",
				"org.terrier.structures.DirectIndex",
				"org.terrier.structures.Index,java.lang.String,java.lang.Class",
				"index,structureName,"+
					(FieldScore.FIELDS_COUNT > 0 ? fieldDirectIndexPostingIteratorClass : basicDirectIndexPostingIteratorClass));
		currentIndex.addIndexStructureInputStream(
				"direct",
				"org.terrier.structures.DirectIndexInputStream",
				"org.terrier.structures.Index,java.lang.String,java.lang.Class",
				"index,structureName,"+
					(FieldScore.FIELDS_COUNT > 0 ? fieldDirectIndexPostingIteratorClass : basicDirectIndexPostingIteratorClass));
		currentIndex.setIndexProperty("index.direct.fields.count", ""+FieldScore.FIELDS_COUNT );
		currentIndex.setIndexProperty("index.direct.fields.names", ArrayUtils.join(FieldScore.FIELD_NAMES, ","));
		//directIndexBuilder.finishedCollections();
		directIndexBuilder.close();
		docIndexBuilder.finishedCollections();
		//register the matching document index entry factory
		if (FieldScore.FIELDS_COUNT > 0)
		{
			currentIndex.addIndexStructure("document-factory", FieldDocumentIndexEntry.Factory.class.getName(), "java.lang.String", "${index.direct.fields.count}");
		}
		else
		{
			currentIndex.addIndexStructure("document-factory", BasicDocumentIndexEntry.Factory.class.getName(), "", "");
		}
		try{
			metaBuilder.close();
		} catch (IOException ioe) {
			// logger.error("Could not finish MetaIndexBuilder: ", ioe);
		}
		/* and then merge all the temporary lexicons */
		lexiconBuilder.finishedDirectIndexBuild();
		currentIndex.setIndexProperty("num.Tokens", ""+numberOfTokens);
		if (FieldScore.FIELDS_COUNT > 0)
		{
			currentIndex.addIndexStructure("lexicon-valuefactory", FieldLexiconEntry.Factory.class.getName(), "java.lang.String", "${index.direct.fields.count}");
		}
		/* reset the in-memory mapping of terms to term codes.*/
		TermCodes.reset();
		/* and clear them out of memory */
		System.gc();
		/* record the fact that these data structures are complete */
		try{
			currentIndex.flush();
		} catch (IOException ioe) {
			// logger.error("Problem flushing changes to index", ioe);
		}
	}
	/**
	 * This adds a document to the direct and document indexes, as well
	 * as its terms to the lexicon. Handled internally by the methods
	 * indexFieldDocument and indexNoFieldDocument.
	 * @param docProperties Map&lt;String,String&gt; properties of the document
	 * @param _termsInDocument DocumentPostingList the terms in the document.
	 * @throws Exception if any of the index builders fail
	 */
	protected void indexDocument(Map<String,String> docProperties, DocumentPostingList _termsInDocument) throws Exception
	{
		/* add words to lexicontree */
		lexiconBuilder.addDocumentTerms(_termsInDocument);
		/* add doc postings to the direct index */
		BitIndexPointer dirIndexPost = directIndexBuilder.writePostings(_termsInDocument.getPostings2());
		//.addDocument(termsInDocument.getPostings());
		/* add doc to documentindex, pointing at its direct index postings */
		DocumentIndexEntry die = _termsInDocument.getDocumentStatistics();
		die.setBitIndexPointer(dirIndexPost);
		docIndexBuilder.addEntryToBuffer(die);
		/* add doc metadata to index */
		metaBuilder.writeDocumentEntry(docProperties);
	}
	/**
	 * Creates the inverted index after having created the
	 * direct index, document index and lexicon. Aborts early (with an
	 * error logged) if the index has no terms or no documents.
	 */
	public void createInvertedIndex() {
		if (currentIndex == null)
		{
			currentIndex = Index.createIndex(path,prefix);
			if (currentIndex == null)
			{
				logger.error("No index at ("+path+","+prefix+") to build an inverted index for ");
				/* NOTE(review): execution continues with currentIndex null,
				 * so the getCollectionStatistics() call below will throw
				 * NullPointerException — a return here looks intended. */
			}
		}
		final long beginTimestamp = System.currentTimeMillis();
		//logger.info("Started building the inverted index...");
		if (currentIndex.getCollectionStatistics().getNumberOfUniqueTerms() == 0)
		{
			logger.error("Index has no terms. Inverted index creation aborted.");
			return;
		}
		if (currentIndex.getCollectionStatistics().getNumberOfDocuments() == 0)
		{
			logger.error("Index has no documents. Inverted index creation aborted.");
			return;
		}
		//generate the inverted index
		//logger.info("Started building the inverted index...");
		invertedIndexBuilder = new InvertedIndexBuilder(currentIndex, "inverted");
		invertedIndexBuilder.createInvertedIndex();
		finishedInvertedIndexBuild();
		long endTimestamp = System.currentTimeMillis();
		//logger.info("Finished building the inverted index...");
		long seconds = (endTimestamp - beginTimestamp) / 1000;
		//long minutes = seconds / 60;
		//logger.info("Time elapsed for inverted file: " + seconds);
		try{
			currentIndex.flush();
		} catch (IOException ioe) {
			//logger.warn("Problem flushin index", ioe);
		}
	}
	/**
	 * Hook method that creates the right type of DocumentTree class:
	 * a FieldDocumentPostingList when fields are indexed, otherwise a
	 * plain DocumentPostingList.
	 */
	protected void createDocumentPostings(){
		if (FieldScore.FIELDS_COUNT > 0)
			termsInDocument = new FieldDocumentPostingList(FieldScore.FIELDS_COUNT);
		else
			termsInDocument = new DocumentPostingList();
	}
	/** Hook method, called when the inverted index is finished - ie the lexicon is finished */
	protected void finishedInvertedIndexBuild()
	{
		//optimise the lexicon for random access now that it is complete
		LexiconBuilder.optimise(currentIndex, "lexicon");
	}
}